Data Processing
Reading the data from CSV
library(readr)
library(tidyverse)
sheet1 = read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
Selecting Area_name,STCOU, column ending with “D” column and renaming Area_name
sheet2=sheet1%>%select(Area_name,STCOU,ends_with("D"))%>%rename("area_name"=Area_name)
sheet2
Splitting the enrollment string into measurement and year.
sheet3$Measurements=substr(sheet3$enrollment_value,1,7)
sheet3$Year= as.numeric(format(as.Date(substr(sheet3$enrollment_value,8,9),format="%y"),"%Y"))
sheet3
Creating two data set, county and non-county
sheet_county=sheet3[c(grep(pattern = ", \\w\\w", sheet3$area_name)),]
sheet_not_county=sheet3[-c(grep(pattern = ", \\w\\w", sheet3$area_name)),]
class(sheet_county) = c("county", class(sheet_county))
class(sheet_not_county) = c("state", class(sheet_not_county))
Creating new column of state from county data
sheet_county$state=str_sub(sheet_county$area_name,-2,-1)
sheet_county
Creating division in non-county data
division1=(c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island","Vermont"))
division2=c("New Jersey", "New York","Pennsylvania")
division3=c("Illinois", "Indiana", "Michigan","Ohio","Wisconsin")
division4=c("Iowa", "Kansas", "Minnesota","Missouri", "Nebraska","North Dakota","South Dakota")
division5=c("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
division6=c("ALABAMA", "Kentucky", "Mississippi","Tennessee")
division7=c("Arkansas", "Louisiana", "Oklahoma","Texas")
division8=c("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
division9=c("Alaska", "California", "Hawaii", "Oregon","Washington")
length=nrow(sheet_not_county)
sheet_not_county["division"]=NA
for (i in 1:length){
if (sheet_not_county$area_name[i] %in% toupper(division1)){
sheet_not_county$division[i]="New England"
} else if (sheet_not_county$area_name[i] %in% toupper(division2)){
sheet_not_county$division[i]="mid_Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division3)){
sheet_not_county$division[i]="East North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division4)){
sheet_not_county$division[i]="West North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division5)){
sheet_not_county$division[i]="South Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division6)){
sheet_not_county$division[i]="East South Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division7)){
sheet_not_county$division[i]="West South Central "
}else if (sheet_not_county$area_name[i] %in% toupper(division8)){
sheet_not_county$division[i]="Mountain"
}else if (sheet_not_county$area_name[i] %in% toupper(division9)){
sheet_not_county$division[i]="Pacific"
}else {
sheet_not_county$division[i]="ERROR"
}
}
sheet_not_county
Requirements
Creating individual function from step 1 to 6
part2=read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
function_for_step_1_2= function(part, def_value = "enrollment_value"){
part_1 = part%>%select(Area_name,STCOU,ends_with("D"))%>%rename("area_name"=Area_name)
part_2 = part_1%>%pivot_longer(cols = 3:12,names_to = def_value,values_to = "number")
return(part_2)
}
function_for_step_3= function(part_2){
part_2$Year = as.numeric(format(as.Date(substr(part_2$enrollment_value,8,9),format="%y"),"%Y"))
part_2$Measurements=substr(part_2$enrollment_value,1,7)
return(part_2)
}
func_5 = function(df_county){
df_county$state=str_sub(df_county$area_name,-2,-1)
return (df_county)
}
func_6 = function(sheet_not_county){
division1=(c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island","Vermont"))
division2=c("New Jersey", "New York","Pennsylvania")
division3=c("Illinois", "Indiana", "Michigan","Ohio","Wisconsin")
division4=c("Iowa", "Kansas", "Minnesota","Missouri", "Nebraska","North Dakota","South Dakota")
division5=c("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
division6=c("ALABAMA", "Kentucky", "Mississippi","Tennessee")
division7=c("Arkansas", "Louisiana", "Oklahoma","Texas")
division8=c("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
division9=c("Alaska", "California", "Hawaii", "Oregon","Washington")
length=nrow(sheet_not_county)
sheet_not_county["division"]<-NA
for (i in 1:length){
if (sheet_not_county$area_name[i] %in% toupper(division1)){
sheet_not_county$division[i]="New England"
} else if (sheet_not_county$area_name[i] %in% toupper(division2)){
sheet_not_county$division[i]="mid_Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division3)){
sheet_not_county$division[i]="East North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division4)){
sheet_not_county$division[i]="West North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division5)){
sheet_not_county$division[i]="South Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division6)){
sheet_not_county$division[i]="East South Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division7)){
sheet_not_county$division[i]="West South Central "
}else if (sheet_not_county$area_name[i] %in% toupper(division8)){
sheet_not_county$division[i]="Mountain"
}else if (sheet_not_county$area_name[i] %in% toupper(division9)){
sheet_not_county$division[i]="Pacific"
}else {
sheet_not_county$division[i]="ERROR"
}
}
return (sheet_not_county)
}
function_for_steps4_5_6= function(part_2){
df_county = part_2[c(grep(pattern = ", \\w\\w", part_2$area_name)),]
df_not_county = part_2[-c(grep(pattern = ", \\w\\w", part_2$area_name)),]
class(df_county) = c("county", class(df_county))
class(df_not_county) = c("state", class(df_not_county))
part_5 = func_5(df_county)
part_6 = func_6(df_not_county)
return(list(part_5,part_6))
}
Creating Wrapper function
my_wrapper = function(url, def_value = "enrollment_value"){
a = read_csv(url)
b = function_for_step_1_2(a,def_value)
c = function_for_step_3(b)
d = function_for_steps4_5_6(c)
return(list(d[[1]], d[[2]]))
}
my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
## [[1]]
## # A tibble: 31,450 × 7
## area_name STCOU enrollment_value number Year Measurements state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 31,440 more rows
##
## [[2]]
## # A tibble: 530 × 7
## area_name STCOU enrollment_value number Year Measurements division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 520 more rows
Writing a Generic Function for Summarizing
Plotting not county data with mean value of the enrollment across the years for each Division
plot.state = function(bind_not_county,var_name="number"){
mean_enrollment = bind_not_county %>%
filter(division != "ERROR") %>%
group_by(Year, division) %>%
summarise(Mean = mean(get(var_name),na.rm=TRUE))
ggplot(mean_enrollment, aes(x =Year, y =Mean, color = division)) + geom_line()+geom_point()
}
Plotting county data allowing user to specify the state of interest, arrangement and count based on enrollment
plot.county = function(bind_county, var_name = "number", State = "AZ", arrangement = "top", count = 5){
firstfilter_county = bind_county %>%
filter(state ==State) %>%
group_by(area_name) %>%
summarise(MeanVal = mean(get(var_name), na.rm = TRUE))
if(arrangement == "top"){
firstfilter_county = head(firstfilter_county[order(-firstfilter_county$MeanVal),],count)
}else{
firstfilter_county = head(firstfilter_county[order(firstfilter_county$MeanVal),],count)
}
secondfilter_df=bind_county[bind_county$area_name %in% firstfilter_county$area_name, ]
ggplot(secondfilter_df, aes(x = Year, y = get(var_name),color = area_name)) + geom_line() + geom_point()
}
Put it all together
data_1 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv",)
data_2 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
bind_county <- bind_func(data_1, data_2)[[1]]
bind_not_county <- bind_func(data_1, data_2)[[2]]
class(bind_county) = c("county", class(bind_county))
class(bind_not_county) = c("state", class(bind_not_county))
plot.state(bind_not_county)

plot.county(bind_county, State = "PA", arrangement = "top", count = 7)

plot.county(bind_county, State = "PA", arrangement = "bottom", count = 4)

plot.county(bind_county)

plot.county(bind_county, State = "MN", arrangement = "top", count = 10)

first = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv" )
second = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
third = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv" )
fourth = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv")
FirstSecond = bind_func(first, second)
ThirdFourth = bind_func(third, fourth)
FirsttoFourth = bind_func(FirstSecond, ThirdFourth)
FirsttoFourth
## [[1]]
## # A tibble: 125,800 × 7
## area_name STCOU enrollment_value number Year Measurements state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 PST015171D 25508 1971 PST0151 AL
## 2 Autauga, AL 01001 PST015172D 27166 1972 PST0151 AL
## 3 Autauga, AL 01001 PST015173D 28463 1973 PST0151 AL
## 4 Autauga, AL 01001 PST015174D 29266 1974 PST0151 AL
## 5 Autauga, AL 01001 PST015175D 29718 1975 PST0151 AL
## 6 Autauga, AL 01001 PST015176D 29896 1976 PST0151 AL
## 7 Autauga, AL 01001 PST015177D 30462 1977 PST0151 AL
## 8 Autauga, AL 01001 PST015178D 30882 1978 PST0151 AL
## 9 Autauga, AL 01001 PST015179D 32055 1979 PST0151 AL
## 10 Autauga, AL 01001 PST025181D 31985 1981 PST0251 AL
## # … with 125,790 more rows
##
## [[2]]
## # A tibble: 2,120 × 7
## area_name STCOU enrollment_value number Year Measurements division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 PST015171D 206827028 1971 PST0151 ERROR
## 2 UNITED STATES 00000 PST015172D 209283904 1972 PST0151 ERROR
## 3 UNITED STATES 00000 PST015173D 211357490 1973 PST0151 ERROR
## 4 UNITED STATES 00000 PST015174D 213341552 1974 PST0151 ERROR
## 5 UNITED STATES 00000 PST015175D 215465246 1975 PST0151 ERROR
## 6 UNITED STATES 00000 PST015176D 217562728 1976 PST0151 ERROR
## 7 UNITED STATES 00000 PST015177D 219759860 1977 PST0151 ERROR
## 8 UNITED STATES 00000 PST015178D 222095080 1978 PST0151 ERROR
## 9 UNITED STATES 00000 PST015179D 224567234 1979 PST0151 ERROR
## 10 UNITED STATES 00000 PST025181D 229466391 1981 PST0251 ERROR
## # … with 2,110 more rows
plot.state(FirsttoFourth[[2]])

plot.county(FirsttoFourth[[1]], State = "CT", arrangement = "top", count = 6)

plot.county(FirsttoFourth[[1]], State = "NC", arrangement = "bottom", count = 10)

plot.county(FirsttoFourth[[1]])

plot.county(FirsttoFourth[[1]], State = "MN", arrangement = "top", count = 4)
